# Global knitr chunk options: echo the code of every chunk and keep warnings
# and messages out of the rendered report.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library (readr)
library(stringr)
library(ggplot2)
library(plotly)
library(dplyr)
library(magrittr)
library(scales)
library(shiny)
library(kableExtra)
library(Hmisc)
library(glue)
# Location of the cleaned gapminder data set used throughout the report.
urlfile <- "https://raw.githubusercontent.com/Bioinformatics-Research-Network/skill-assessments/main/R%20for%20Data%20Science/gapminder_clean.csv"
# read_csv() downloads http(s) URLs itself, so no explicit url() connection
# is needed.
mydata <- read_csv(urlfile)
# Replace every space in the column names with a dot; the rest of the report
# refers to the columns by these dotted names (e.g. `Country.Name`).
names(mydata) <- str_replace_all(names(mydata), fixed(" "), ".")
This is a report of the analysis of gapminder_clean.csv data.
# Observations for 1962. `mydata_1962` is used by this plot, by the
# correlation test and by the life-expectancy comparison, but is not defined
# anywhere in the visible source, so derive it here the same way
# `mydata_1967` and `mydata_2007` are derived.
mydata_1962 <- mydata %>%
  filter(Year == 1962)

# Scatter plot of GDP per capita against CO2 emissions per capita for 1962.
plot_1962 <- ggplot(
  mydata_1962,
  aes(x = `CO2.emissions.(metric.tons.per.capita)`, y = gdpPercap)
) +
  geom_point(color = "firebrick") +
  ggtitle("The correlation between CO2 emissions and GDP per capita in year 1962") +
  # Plain string label (no expression() wrapper), consistent with plot_1967.
  labs(y = "GDP per capita", x = "CO2 emissions (metric tons per capita)") +
  theme(
    axis.title.x = element_text(vjust = 0, size = 15),
    axis.title.y = element_text(vjust = 2, size = 15),
    axis.text = element_text(size = 10),
    plot.title = element_text(hjust = 0.5)
  )
print(plot_1962)
Pearson’s correlation between CO2 emissions and GDP per capita in 1962 was calculated, and the results are presented below.
# Correlation test between CO2 emissions and GDP per capita in 1962, using
# cor.test()'s defaults (the output below reports Pearson's product-moment
# correlation). The stray trailing comma of the original call is removed.
cor.test(
  mydata_1962$`CO2.emissions.(metric.tons.per.capita)`,
  mydata_1962$gdpPercap
)
##
## Pearson's product-moment correlation
##
## data: mydata_1962$`CO2.emissions.(metric.tons.per.capita)` and mydata_1962$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8934697 0.9489792
## sample estimates:
## cor
## 0.9260817
The correlation of ‘CO2 emissions (metric tons per capita)’ and gdpPercap equals 0.9260817.
The associated p-value is below 2.2e-16.
# Years present in the data, in order of first appearance.
all_years <- unique(mydata$Year)

# Correlation between CO2 emissions and GDP per capita within each year,
# using only rows where both values are present. Computed with vapply() so
# the result is built in one step instead of growing a data frame row by row;
# rows stay in the order of `all_years` with default row names 1..n.
year_correlations <- vapply(
  all_years,
  function(current_year) {
    year_rows <- filter(mydata, Year == current_year)
    cor(
      year_rows$`CO2.emissions.(metric.tons.per.capita)`,
      year_rows$gdpPercap,
      use = "complete.obs"
    )
  },
  numeric(1),
  USE.NAMES = FALSE
)
year_cor_co2_gdp <- data.frame(
  Year = all_years,
  Correlation = year_correlations
)
# Show the yearly correlations, strongest first, as a styled table. The
# original row names are kept, so the first (unnamed) column of the rendered
# table is the row's position before sorting.
year_cor_co2_gdp[order(year_cor_co2_gdp$Correlation, decreasing = TRUE), ] %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
| | Year | Correlation |
|---|---|---|
| 2 | 1967 | 0.9387918 |
| 1 | 1962 | 0.9260817 |
| 3 | 1972 | 0.8428986 |
| 5 | 1982 | 0.8166384 |
| 6 | 1987 | 0.8095531 |
| 7 | 1992 | 0.8094316 |
| 8 | 1997 | 0.8081396 |
| 9 | 2002 | 0.8006421 |
| 4 | 1977 | 0.7928336 |
| 10 | 2007 | 0.7204169 |
The correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap is the strongest in the year 1967.
# Keep only the 1967 observations (the year with the strongest correlation).
mydata_1967 <- filter(mydata, Year == 1967)
# Scatter plot for 1967: CO2 emissions on x, GDP per capita on y, with point
# size mapped to population and point colour mapped to continent.
plot_1967 <- ggplot(
  mydata_1967,
  aes(x = `CO2.emissions.(metric.tons.per.capita)`, y = gdpPercap)
) +
  geom_point(aes(size = pop, colour = continent)) +
  theme(
    axis.text = element_text(size = 10),
    axis.title.x = element_text(vjust = 0, size = 15),
    axis.title.y = element_text(vjust = 2, size = 15)
  ) +
  labs(
    title = "The correlation between CO2 emissions and GDP per capita in year 1967",
    x = "CO2 emissions (metric tons per capita)",
    y = "GDP per capita"
  )

# Convert to an interactive plotly widget and centre it on the page.
interactive_1967 <- ggplotly(plot_1967)
div(interactive_1967, align = "center")
A one-way ANOVA was chosen to examine the relationship between continent and energy use, because the data contain one categorical independent variable (continent) with multiple levels (Asia, Europe, Africa, Americas, Oceania) and one quantitative dependent variable (energy use).
Results are presented below.
# One-way ANOVA of energy use per capita by continent, on the 1967 data.
# Rows with missing values are dropped by aov(); the summary output below
# reports how many observations were deleted.
one_way_anova_1967 <- aov(
  `Energy.use.(kg.of.oil.equivalent.per.capita)` ~ continent,
  data = mydata_1967
)
summary(one_way_anova_1967)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 3 30161255 10053752 9.642 0.000334 ***
## Residuals 21 21895723 1042653
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 234 observations deleted due to missingness
The p-value is low (p < 0.001), so energy use appears to differ between continents.
Because two groups (Europe and Asia) were being compared, a t-test was chosen to assess whether there is a significant difference between Europe and Asia with respect to imports of goods and services in the years after 1990.
Results are presented below.
# Observations from the years after 1990 for the two continents compared.
mydata_1990 <- mydata %>%
  filter(Year > 1990, continent %in% c("Europe", "Asia"))

# Two-sample t-test of imports (% of GDP) between Asia and Europe, using
# t.test()'s defaults (the output below reports a Welch test).
ttest_1990 <- t.test(
  `Imports.of.goods.and.services.(%.of.GDP)` ~ continent,
  data = mydata_1990
)
ttest_1990
##
## Welch Two Sample t-test
##
## data: Imports.of.goods.and.services.(%.of.GDP) by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
## -2.321099 12.433240
## sample estimates:
## mean in group Asia mean in group Europe
## 46.84531 41.78924
Because the p-value is high (> 0.05), it is concluded that there is no significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990.
# Countries present in the data, in order of first appearance.
all_countries <- unique(mydata$Country.Name)

# Mean population density per country over all years, ignoring missing values.
# Built with vapply() so the means stay numeric throughout (the original
# c(country, average) coerced them to character and back) and the data frame
# is created once instead of being grown row by row.
# USE.NAMES = FALSE keeps the default row names 1..n, i.e. the country's
# position in `all_countries`.
average_density <- vapply(
  all_countries,
  function(current_country) {
    country_rows <- filter(mydata, Country.Name == current_country)
    mean(
      as.numeric(country_rows$`Population.density.(people.per.sq..km.of.land.area)`),
      na.rm = TRUE
    )
  },
  numeric(1),
  USE.NAMES = FALSE
)
pop_dens_avg <- data.frame(
  Country = all_countries,
  Average.population.density = average_density
)

# Densest countries first.
pop_dens_avg <- pop_dens_avg[
  order(pop_dens_avg$Average.population.density, decreasing = TRUE),
]
# Show the first rows (head() default) of the already sorted table, i.e. the
# countries with the highest average population density.
head(pop_dens_avg) %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
| | Country | Average.population.density |
|---|---|---|
| 145 | Macao SAR, China | 14732.035 |
| 163 | Monaco | 14089.900 |
| 101 | Hong Kong SAR, China | 5153.057 |
| 209 | Singapore | 4361.500 |
| 88 | Gibraltar | 2622.250 |
| 23 | Bermuda | 1132.780 |
Macao SAR, China, has the highest average ‘Population density (people per sq. km of land area)’ across all years. It is equal to 14732.035.
# Earliest and latest year in the data. min()/max() are used instead of the
# first/last element of `all_years`, which is only correct when the years
# happen to appear in ascending order in the file.
first_year <- min(all_years, na.rm = TRUE)
last_year <- max(all_years, na.rm = TRUE)
print(glue("First measurement was taken in {first_year} and last one in {last_year}."))
## First measurement was taken in 1962 and last one in 2007.
# Observations for the last year in the data.
mydata_2007 <- mydata %>%
  filter(Year == 2007)

# Life expectancy of every country in 1962 and in 2007, aligned with
# `all_countries`. match() yields NA for a country missing from either year,
# which propagates to NA in both result columns (same outcome as the original
# explicit length check), and it picks the first row if a country appears
# more than once in a year instead of failing on the row assignment.
life_exp_1962 <- mydata_1962$`Life.expectancy.at.birth,.total.(years)`[
  match(all_countries, mydata_1962$Country.Name)
]
life_exp_2007 <- mydata_2007$`Life.expectancy.at.birth,.total.(years)`[
  match(all_countries, mydata_2007$Country.Name)
]

# One row per country, in the order of `all_countries` (default row names
# 1..n). The values stay numeric throughout instead of being coerced to
# character and back.
# NOTE: `Life.exp.increase.percentage` is the 2007 value expressed as a
# percentage of the 1962 value (200 means life expectancy doubled, i.e. grew
# by 100%), not the increment itself.
exp_increase <- data.frame(
  Country = all_countries,
  Life.exp.increase.numerical = life_exp_2007 - life_exp_1962,
  Life.exp.increase.percentage = round(
    life_exp_2007 / life_exp_1962 * 100,
    digits = 1
  )
)
The table below is sorted in descending order by the numerical (absolute) increase in life expectancy.
# Countries with the largest absolute gain in life expectancy (in years)
# between 1962 and 2007; head() keeps the first rows of the sorted table.
head(exp_increase[order(exp_increase$Life.exp.increase.numerical, decreasing = TRUE), ]) %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
| | Country | Life.exp.increase.numerical | Life.exp.increase.percentage |
|---|---|---|---|
| 150 | Maldives | 36.91615 | 195.9 |
| 24 | Bhutan | 33.19895 | 200.3 |
| 238 | Timor-Leste | 31.08515 | 189.5 |
| 242 | Tunisia | 30.86076 | 171.2 |
| 182 | Oman | 30.82310 | 169.6 |
| 171 | Nepal | 30.59963 | 185.1 |
The table below is sorted in descending order by the percentage column, i.e. life expectancy in 2007 expressed as a percentage of its 1962 value.
# Countries with the largest relative gain in life expectancy between 1962
# and 2007, ranked by the 2007 value as a percentage of the 1962 value;
# head() keeps the first rows of the sorted table.
head(exp_increase[order(exp_increase$Life.exp.increase.percentage, decreasing = TRUE), ]) %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
| | Country | Life.exp.increase.numerical | Life.exp.increase.percentage |
|---|---|---|---|
| 24 | Bhutan | 33.19895 | 200.3 |
| 150 | Maldives | 36.91615 | 195.9 |
| 151 | Mali | 25.71346 | 190.1 |
| 238 | Timor-Leste | 31.08515 | 189.5 |
| 171 | Nepal | 30.59963 | 185.1 |
| 84 | Gambia, The | 25.90834 | 179.3 |
In the Maldives, life expectancy has grown by 37 years, reaching 196% of its 1962 value (an increase of about 96%). In Bhutan, life expectancy has grown by 33 years, reaching just over 200% of its 1962 value (an increase of about 100%).